Overview

*Project title: Anime Recommendation System
*Repository: {replace this with your git repository link}
*Team member(s): Chen, Wan Qi (wac45@pitt.edu), Clarchick, Victoria (vlc24@pitt.edu), Gupta, Abhibha (abg96@pitt.edu)

Abstract This is a reflection on the creation of a recommendation system using R. The purpose of the recommendation system was to suggest anime to different users. The dataset was first explored through various plots (e.g., a correlation plot), and the most important features were taken into consideration for the recommendation system. The different variables were also evaluated to determine how they influence one another. Each variable was run through an extraction and frequency function and then analyzed. After the data exploration, the dataset was run through the recommendation system. We used the ‘hybrid’ approach, which weighs different methods based on popularity, randomness and recommendation. The results for our method are provided in the subsequent sections.

Import and Install libraries

library(dplyr)
library(ggplot2)
library(tidyverse)
library(GGally)
library(caret)
library(forcats)
library(recommenderlab)
library(data.table)
library(reshape2)
library(maditr)
# library(ggstatsplot) #for correlation in task 5
# Load the three input tables.
# The directory holding the CSV files can be overridden with the
# ANIME_DATA_DIR environment variable; when it is not set, the original
# location is used, so existing setups keep working unchanged.
data_dir <- Sys.getenv("ANIME_DATA_DIR", unset = "/Users/abhibhagupta/Desktop")
csv_Rating <- read.csv(file = file.path(data_dir, "rating_complete.csv"))
csv_Anime <- read.csv(file = file.path(data_dir, "anime.csv"))
csv_Genre <- read.csv(file = file.path(data_dir, "anime_genre_year3.csv"))
# Working copies: df_* are used by the data exploration section,
# *_data by the recommendation section.
df_anime <- csv_Anime
df_rating <- csv_Rating
anime_data <- csv_Anime
rating_data <- csv_Rating
df_genre <- csv_Genre

#Data Exploration

##This is for preprocessing the data
#Read data
##df_anime <- read.csv(file = "C:/Users/set4s/Documents/Data Mining/Data Files/anime.csv")
#df_anime
##df_rating <- read.csv(file = "C:/Users/set4s/Documents/Data Mining/Data Files/rating_complete.csv")
#removing unknowns
###summary(df_rating)
#removing unknowns
# Coerce the columns used for ranking, scoring and joining.
# Values that cannot be parsed as numbers become NA (with a coercion warning)
# and are removed by na.omit() below.
df_anime$Ranked <- as.integer(df_anime$Ranked)
# Score is a decimal (e.g. 8.78); as.integer() would truncate it to 8.
df_anime$Score <- as.numeric(df_anime$Score)
df_anime$MAL_ID <- as.integer(df_anime$MAL_ID)
df_anime_clean <- na.omit(as.data.frame(df_anime))
# Rename the key by name rather than by position so it matches df_rating.
names(df_anime_clean)[names(df_anime_clean) == "MAL_ID"] <- "anime_id"
df_rating_clean <- na.omit(as.data.frame(df_rating))

# Attach the anime attributes to every rating. Ratings whose anime was dropped
# above end up with NA attributes and are removed by the second na.omit().
df_combine <- left_join(df_rating_clean, df_anime_clean, by = "anime_id")
df_com_clean <- df_combine %>%
  select(user_id:Plan.to.Watch) %>%
  as.data.frame() %>%
  na.omit()
# The user's own rating is renamed so it is not confused with the anime Score.
names(df_com_clean)[names(df_com_clean) == "rating"] <- "user_rating"

# Subsets restricted to the best-ranked anime. `<=` is used so that
# "top N" really includes rank N (the previous `<` silently dropped it).
df_com_clean_1000 <- df_com_clean[df_com_clean$Ranked <= 1000, ]
df_com_clean_20 <- df_com_clean[df_com_clean$Ranked <= 20, ]
df_com_clean_100 <- df_com_clean[df_com_clean$Ranked <= 100, ]
# Panel function for ggpairs(): a scatter plot of the two mapped variables
# with a fitted smoother drawn on top.
#   data    - data frame handed to ggplot()
#   mapping - aesthetic mapping for the panel
#   method  - smoothing method forwarded to geom_smooth() (default "lm")
# Returns the ggplot object for the panel.
lowerFn <- function(data, mapping, method = "lm") {
  panel <- ggplot(data = data, mapping = mapping)
  panel <- panel + geom_point(colour = "slategray4", size = 2, shape = 18)
  panel <- panel + geom_smooth(method = method, color = "coral3")
  panel +
    theme_minimal() +
    theme(axis.text.x = element_text(size = 8, angle = 45))
}
# Pairs plot of the numeric popularity/score columns for the first 1000 anime
# when ordered by Ranked (ascending).
ggpairs(
  df_anime_clean %>%
    arrange(Ranked) %>%
    slice(seq_len(1000)) %>%
    select(Popularity, Favorites, Score, Ranked, Watching, Members) %>%
    mutate(across(everything(), as.numeric)),
  lower = list(continuous = wrap(lowerFn, method = "lm")),
  diag = list(continuous = wrap("barDiag", fill = 'skyblue1', colour = "skyblue4")),
  upper = list(continuous = wrap("cor", size = 5)),
  cardinality_threshold = 50,
  progress = FALSE
)

Proportionate: Popularity/Rank; Watching/Members; Score: Favorites, Members. Disproportionate: Rank: Favorites, Score, Members; Popularity: Members, Score. Important variables: Score and Rank.

###View Genres
#df_com_clean_1000%>% group_by(Genres) %>% 
#  summarize(count = n())

###Types of Genres
# Genre labels that are searched for in the Genres column.
genres <- c(
  "Action", "Adventure", "Cars", "Comedy", "Dementia", "Demons", "Drama",
  "Ecchi", "Fantasy", "Game", "Harem", "Historical", "Horror", "Josei",
  "Kids", "Magic", "Martial Arts", "Mecha", "Military", "Music", "Mystery",
  "Parody", "Police", "Psychological", "Romance", "Samurai", "School",
  "Sci-Fi", "Seinen", "Shoujo", "Shounen", "Slice of Life", "Space",
  "Sports", "Super Power", "Supernatural", "Thriller", "Vampire"
)

# For every genre, count the rating rows (top-1000 subset) whose Genres
# string contains that label.
genres_df <- data.frame(
  Genres = genres,
  Count = vapply(
    genres,
    function(genre_name) sum(str_detect(df_com_clean_1000$Genres, genre_name)),
    integer(1)
  )
)
# The 10 and 5 genres with the most rating rows.
genres_df_10 <- slice(arrange(genres_df, desc(Count)), seq_len(10))
genres_df_5 <- slice(arrange(genres_df, desc(Count)), seq_len(5))
###Plot the Genres
# Horizontal bar chart: rating-row count for every genre.
ggplot(genres_df, aes(x = Count, y = Genres)) +
  geom_col(width = 0.6, fill = "#8888ff") +
  labs(title = "Count of Animes within Genres",
       x = "Count of Animes",
       y = "Genres") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Same chart restricted to the 10 most frequent genres, ordered by count and
# labelled with the count itself.
ggplot(genres_df_10, aes(x = Count, y = fct_reorder(Genres, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Top 10 Count of Animes within Genres",
       x = "Count of Animes",
       y = "Genres") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating over the rating rows (top-1000 subset) whose Genres string
# contains each genre label.
genres_df_2 <- data.frame(
  Genres = genres,
  Rating = vapply(
    genres,
    function(genre_name) {
      in_genre <- str_detect(df_com_clean_1000$Genres, genre_name)
      mean(df_com_clean_1000[in_genre, ]$user_rating)
    },
    numeric(1)
  )
)
# NOTE: despite the "_5" suffix this keeps the 10 best-rated genres.
genres_df_2_5 <- slice(arrange(genres_df_2, desc(Rating)), seq_len(10))
# Horizontal bar chart: mean user rating for every genre.
ggplot(genres_df_2, aes(x = Rating, y = Genres)) +
  geom_col(width = 0.6, fill = "#8888ff") +
  labs(title = "Average Ratings for Genre",
       x = "Average ratings",
       y = "Genres") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# The 10 best-rated genres, ordered by mean rating and labelled with it.
ggplot(genres_df_2_5, aes(x = Rating, y = fct_reorder(Genres, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Top 10 Average Ratings for Genre",
       x = "Average ratings",
       y = "Genres") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

The first pair of plots shows how many users watched each genre; the second pair shows the average rating for each genre. Only 1 genre overlaps between the two top-10 lists, which shows that the most-watched genres are not highly rated.

# Rating count
#df_com_clean_1000 %>%
#  group_by(user_rating) %>%
#  summarize(count = n())

# Number of rating rows per rating value, over the whole cleaned dataset.
rating_counts_all <- summarise(group_by(df_com_clean, user_rating), count = n())
ggplot(rating_counts_all, aes(x = user_rating, y = count)) +
  geom_col(fill = "#8888ff") +
  geom_text(aes(label = count, vjust = -0.2)) +
  scale_x_continuous(n.breaks = 10) +
  labs(title = "Rating Distribution",
       x = "Rating",
       y = "Occurrences Count") +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Number of rating rows per rating value, restricted to the top-1000 subset.
rating_counts_1000 <- summarise(group_by(df_com_clean_1000, user_rating), count = n())
ggplot(rating_counts_1000, aes(x = user_rating, y = count)) +
  geom_col(fill = "#8888ff") +
  geom_text(aes(label = count, vjust = -0.2)) +
  scale_x_continuous(n.breaks = 10) +
  labs(title = "Rating Distribution",
       x = "Rating",
       y = "Occurrences Count") +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

Out of the top 1000 ranked anime, most have been rated by approximately 25,000 users, so the average rating is well distributed.

#df_com_clean_1000 %>%
#  group_by(anime_id) %>% 
#  summarize(count = n()) %>%
#  slice_head(n = 10)


# Scatter plot of how many ratings each anime id received (top-1000 subset),
# with a smoothed trend line.
ratings_per_anime <- summarise(group_by(df_com_clean_1000, anime_id), count = n())
ggplot(ratings_per_anime, aes(x = anime_id, y = count)) +
  geom_point(alpha = 0.2, color = "#4020dd") +
  geom_smooth(color = "red") +
  scale_x_continuous(n.breaks = 10) +
  labs(title = "Number of Ratings per anime",
       x = "anime id",
       y = "Number of ratings") +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Histogram (log10 x axis) of the number of ratings received per anime in the
# top-1000 subset. Each bar counts anime, so the y axis is labelled as a count
# of anime rather than "anime id".
df_com_clean_1000 %>%
  group_by(anime_id) %>%
  summarize(count = n()) %>%
  ggplot(aes(x = count)) +
  # bins = 30 is the geom_histogram() default, spelled out to avoid the
  # "pick a better value" message.
  geom_histogram(bins = 30, fill = "#8888ff", color = "#4020dd") +
  ggtitle("Anime rating histogram") +
  xlab("Number of ratings") +
  ylab("Number of anime") +
  scale_x_log10(n.breaks = 10) +
  theme(axis.title.x = element_text(vjust = -5, face = "bold"),
        axis.title.y = element_text(vjust = 10, face = "bold"),
        plot.margin = margin(0.7, 0.5, 1, 1.2, "cm"))

# Histogram (log10 x axis) of how many ratings each user gave within the
# top-1000 subset.
ratings_per_user <- summarise(group_by(df_com_clean_1000, user_id), count = n())
ggplot(ratings_per_user, aes(x = count)) +
  geom_histogram(fill = "#8888ff", color = "#4020dd") +
  scale_x_log10(n.breaks = 10) +
  labs(title = "Users' rating histogram",
       x = "Rating count",
       y = "Number of Users that Rated") +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Visualise a random sample of the user x anime rating matrix for the
# top-100 subset: up to `limit` users and up to `limit` anime.
limit <- 50

# Random sample of users (capped at the number of users available).
all_user_ids <- unique(df_com_clean_100$user_id)
sampled_users <- sample(all_user_ids, min(limit, length(all_user_ids)))

# One row per sampled user, one column per anime, cells = user_rating
# (NA where the user did not rate the anime). user_id is dropped before the
# column sampling so that only anime columns can end up in the matrix.
user_anime_wide <- df_com_clean_100 %>%
  filter(user_id %in% sampled_users) %>%
  select(user_id, anime_id, user_rating) %>%
  pivot_wider(names_from = anime_id, values_from = user_rating) %>%
  select(-user_id)

# Random sample of anime columns (capped at the number of columns available).
n_anime <- min(limit, ncol(user_anime_wide))
sampled_cols <- sample(ncol(user_anime_wide), n_anime)

# Transposed so that rows are anime and columns are users, matching the
# x = Anime / y = User axes below.
user_anime_matrix <- t(as.matrix(user_anime_wide[, sampled_cols, drop = FALSE]))

# Base graphics are drawn by successive calls; they are not combined with `+`.
image(seq_len(nrow(user_anime_matrix)), seq_len(ncol(user_anime_matrix)),
      user_anime_matrix, xlab = "Anime", ylab = "User")
abline(h = 0:ncol(user_anime_matrix) + 0.5,
       v = 0:nrow(user_anime_matrix) + 0.5,
       col = "grey")
title(main = list("User x Anime matrix w/ Rating", cex = 1, font = 2))

## integer(0)
# The 20 Studios values with the most rating rows in the top-1000 subset,
# sorted by ascending count.
top_studios <- df_com_clean_1000 %>%
  group_by(Studios) %>%
  summarise(Count = n()) %>%
  top_n(20, wt = Count) %>%
  arrange(Count)
#print(top_studios)
###Types of Studios
studios <- c("Manglobe", "CoMix Wave Films", "P.A. Works", "Toei Animation", "TMS Entertainment", "Brain's Base", "Studio Deen", "White Fox", "Wit Studio", "ufotable", "Studio Pierrot", "Studio Ghibli", "Shaft", "Sunrise", "Production I.G", "J.C.Staff", "A-1 Pictures", "Kyoto Animation", "Bones", "Madhouse")

###Count the rating rows (top-1000 subset) whose Studios string contains each name
studios_df <- data.frame(
  Studios = studios,
  # fixed() matches the name literally; as a regular expression the "." in
  # names such as "J.C.Staff" or "P.A. Works" would match any character.
  Count = vapply(studios, function(studio_name) {
    sum(str_detect(df_com_clean_1000$Studios, fixed(studio_name)))
  }, integer(1))
)
###Plot the Studios
# The 10 studios with the most rating rows, plotted as ordered bars.
studios_df_10 <- slice(arrange(studios_df, desc(Count)), seq_len(10))
ggplot(studios_df_10, aes(x = Count, y = fct_reorder(Studios, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Top 10 Studios Based on the Number of Users that Rated",
       x = "Number of ratings",
       y = "Studios") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating over the rating rows (top-1000 subset) whose Studios string
# contains each studio name.
studios_df_2 <- data.frame(
  Studios = studios,
  # fixed() matches the name literally; as a regular expression the "." in
  # names such as "J.C.Staff" would match any character.
  Rating = vapply(studios, function(studio_name) {
    by_studio <- str_detect(df_com_clean_1000$Studios, fixed(studio_name))
    mean(df_com_clean_1000[by_studio, ]$user_rating)
  }, numeric(1))
)
#print(studios_df_2)
# Best-rated 10 and 5 studios.
studios_df_2_10 <- studios_df_2 %>% arrange(desc(Rating)) %>% slice(1:10)
studios_df_2_5 <- studios_df_2 %>% arrange(desc(Rating)) %>% slice(1:5)
# The 10 best-rated studios, ordered by mean rating and labelled with it.
ggplot(studios_df_2_10, aes(x = Rating, y = fct_reorder(Studios, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Top 10 Studios by Average Rating",
       x = "Average ratings",
       y = "Studios") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

4 studios overlap between the two top-10 lists. The higher-rated studios are watched less, showing that studios that mass-produce do not receive high ratings.

# Rating-row count for every distinct Producers value in the top-100 subset,
# sorted alphabetically by Producers.
top_Producers <- df_com_clean_100 %>%
  group_by(Producers) %>%
  summarise(Count = n()) %>%
  arrange(Producers)
###Types of Producers
# Producer names to look up. "Warner Bros. Japan" previously contained a line
# break inside the string literal and therefore could never match.
producers <- c(
  "A-1 Pictures", "ABC Animation", "Amuse", "Animation Do", "Animax",
  "Aniplex", "Asatsu DK", "Ashi Production", "Asmik Ace", "AT-X",
  "Audio Highs", "Avex Pictures", "Bandai", "Bandai Namco Entertainment",
  "Bandai Visual", "Banpresto", "BS Fuji", "BS11", "CA-Cygames Anime Fund",
  "Crunchyroll SC Anime Fund", "Cyclone Graphics", "Cygames",
  "DAX Production", "Delfi Sound", "Dentsu", "Docomo Anime Store", "dugout",
  "FBC", "Fuji TV", "Fujipacific Music", "Global Solutions",
  "Good Smile Company", "Hakuhodo DY Media Partners",
  "Hakuhodo DY Music & Pictures", "Hakusensha", "Half H.P Studio",
  "Hiroshima Television", "JR East Marketing & Communications", "Kadokawa",
  "Kadokawa Media House", "Kadokawa Pictures Japan", "Kadokawa Shoten",
  "Kanetsu Investment", "Kansai Telecasting", "K-Factory", "King Records",
  "KlockWorx", "Kodansha", "Konami", "Kyoraku Industrial Holdings", "Lantis",
  "Mag Garden", "Mainichi Broadcasting System", "Marvelous", "Media Factory",
  "Medicos Entertainment", "Miracle Bus", "Miracle Robo", "Mirai-Kojo",
  "Movic", "My Theater D.D.", "Nagoya Broadcasting Network", "NewGin", "NHK",
  "Nihon Ad Systems", "Nippon Television Music", "Nippon Television Network",
  "Nitroplus", "Notes", "Pony Canyon", "Pony Canyon Enterprise", "Quaras",
  "Rakuonsha", "Sakura Create", "Science SARU", "Seikaisha", "Shochiku",
  "Shogakukan", "Shogakukan-Shueisha Productions", "Shueisha",
  "SKY Perfect Well Think", "Sony Music Communications",
  "Sony Music Entertainment", "Sound Team Don Juan", "Square Enix",
  "Studio Hibari", "Studio Jack", "Studio Moriken", "Takeshobo", "TAP", "TBS",
  "TC Entertainment", "Techno Sound", "Toho", "TOHO animation",
  "Tohokushinsha Film Corporation", "Tokuma Shoten", "Tokyo MX",
  "Toy's Factory", "Trinity Sound", "Twin Engine", "VAP",
  "Victor Entertainment", "voque ting", "Warner Bros. Japan",
  "Yomiuri Telecasting", "YTV", "Annapuru", "Avex Entertainment", "d-rights",
  "Frontier Works", "Kitty Films", "Madhouse", "Production I.G",
  "Starchild Records", "TMS Music", "TV Asahi", "TV Tokyo", "Unknown"
)

# Count the rating rows (top-100 subset) that list each producer.
# A name only matches as a whole comma-delimited entry: plain substring
# matching counted "Toho" for "Tohokushinsha Film Corporation" and "Bandai"
# for "Bandai Visual". \Q...\E quotes the name so "." etc. are literal.
# NOTE(review): assumes Producers is a comma-separated list like the Genres
# column - confirm against the CSV.
producers_df <- data.frame(
  Producers = producers,
  Count = vapply(producers, function(producer_name) {
    whole_entry <- paste0("(^|,)\\s*\\Q", producer_name, "\\E\\s*(,|$)")
    sum(str_detect(df_com_clean_100$Producers, whole_entry))
  }, integer(1))
)
###Plot the Producers
# NOTE: despite the "_10" suffix this keeps the 20 producers with the most
# rating rows, matching the plot title.
producers_df_10 <- slice(arrange(producers_df, desc(Count)), seq_len(20))
ggplot(producers_df_10, aes(x = Count, y = fct_reorder(Producers, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Top 20 Producers Based on the Number of Users that Rated",
       x = "Number of ratings",
       y = "Producers") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating over the rating rows (top-100 subset) that list each
# producer. A name only matches as a whole comma-delimited entry, so "Toho"
# no longer picks up "Tohokushinsha Film Corporation"; \Q...\E quotes the
# name so regex metacharacters in it are literal.
# NOTE(review): assumes Producers is a comma-separated list like the Genres
# column - confirm against the CSV.
producers_df_2 <- data.frame(
  Producers = producers,
  Rating = vapply(producers, function(producer_name) {
    whole_entry <- paste0("(^|,)\\s*\\Q", producer_name, "\\E\\s*(,|$)")
    by_producer <- str_detect(df_com_clean_100$Producers, whole_entry)
    # Producers with no matching row yield NaN and sort last below.
    mean(df_com_clean_100[by_producer, ]$user_rating)
  }, numeric(1))
)
# NOTE: despite the "_10" suffix this keeps the 20 best-rated producers.
producers_df_2_10 <- producers_df_2 %>% arrange(desc(Rating)) %>% slice(1:20)
producers_df_2_5 <- producers_df_2 %>% arrange(desc(Rating)) %>% slice(1:5)
# The 20 best-rated producers, ordered by mean rating and labelled with it.
ggplot(producers_df_2_10, aes(x = Rating, y = fct_reorder(Producers, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Top 20 Producers by Average Rating",
       x = "Average ratings",
       y = "Producers") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

This category had to be expanded to the top 20 to find 1 common producer. As with studios, this indicates that there are many producers, but the most-watched and the most highly rated have almost nothing in common.

# Rating-row count for every distinct Licensors value in the top-100 subset,
# sorted alphabetically by Licensors.
top_Licensors <- df_com_clean_100 %>%
  group_by(Licensors) %>%
  summarise(Count = n()) %>%
  arrange(Licensors)
###Types of Licensors
# Licensor names to look up.
Licensors <- c("4Kids Entertainment", "Aniplex of America", "Bandai Entertainment", "Central Park Media", "Crunchyroll", "Geneon Entertainment USA", "GKIDS", "Inc.", "Manga Entertainment", "NYAV Post", "Sentai Filmworks", "Tokyopop", "ADV Films", "Discotek Media", "Eleven Arts", "Flatiron Film Company", "Funimation", "NIS America", "Nozomi Entertainment", "Unknown", "VIZ Media", "Walt Disney Studios")

# Count the rating rows (top-1000 subset) whose Licensors string contains each
# name.
Licensors_df <- data.frame(
  Licensors = Licensors,
  # fixed() matches the name literally; as a regular expression "Inc." would
  # match "Inc" followed by any character.
  Count = vapply(Licensors, function(licensor_name) {
    sum(str_detect(df_com_clean_1000$Licensors, fixed(licensor_name)))
  }, integer(1))
)
###Plot the Licensors
# The 10 and 5 licensors with the most rating rows; the top 10 are plotted.
Licensors_df_10 <- slice(arrange(Licensors_df, desc(Count)), seq_len(10))
Licensors_df_5 <- slice(arrange(Licensors_df, desc(Count)), seq_len(5))
ggplot(Licensors_df_10, aes(x = Count, y = fct_reorder(Licensors, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Top 10 Licensors Based on the Number of Users that Rated",
       x = "Number of ratings",
       y = "Licensors") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating over the rating rows (top-1000 subset) whose Licensors
# string contains each licensor name.
Licensors_df_2 <- data.frame(
  Licensors = Licensors,
  # fixed() matches the name literally; as a regular expression "Inc." would
  # match "Inc" followed by any character.
  Rating = vapply(Licensors, function(licensor_name) {
    by_licensor <- str_detect(df_com_clean_1000$Licensors, fixed(licensor_name))
    mean(df_com_clean_1000[by_licensor, ]$user_rating)
  }, numeric(1))
)
# Best-rated 10 and 5 licensors.
Licensors_df_2_10 <- Licensors_df_2 %>% arrange(desc(Rating)) %>% slice(1:10)
Licensors_df_2_5 <- Licensors_df_2 %>% arrange(desc(Rating)) %>% slice(1:5)
# The 10 best-rated licensors, ordered by mean rating and labelled with it.
ggplot(Licensors_df_2_10, aes(x = Rating, y = fct_reorder(Licensors, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Top 10 Licensors by Average Rating",
       x = "Average ratings",
       y = "Licensors") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

Similar to Studios and Producers: mass licensing is equivalent to less desirable ratings.

# Rating-row count per Source value in the top-1000 subset.
Source_df <- df_com_clean_1000 %>%
  group_by(Source) %>%
  summarise(Count = n()) %>%
  as.data.frame()
# The 10 sources with the most rating rows, plotted as ordered bars.
Source_df_10 <- slice(arrange(Source_df, desc(Count)), seq_len(10))
ggplot(Source_df_10, aes(x = Count, y = fct_reorder(Source, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Top 10 Sources VS Number of Users that Rated",
       x = "Number of ratings",
       y = "Source") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating per Source value in the top-1000 subset.
Source_df_2 <- df_com_clean_1000 %>%
  group_by(Source) %>%
  summarise(Rating = mean(user_rating))
# The 10 best-rated sources, ordered by mean rating and labelled with it.
Source_df_2_10 <- slice(arrange(Source_df_2, desc(Rating)), seq_len(10))
ggplot(Source_df_2_10, aes(x = Rating, y = fct_reorder(Source, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Top 10 Sources by Average Rating",
       x = "Average ratings",
       y = "Source") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Rating-row count per Source value over the whole cleaned dataset.
Source_df2 <- df_com_clean %>%
  group_by(Source) %>%
  summarise(Count = n()) %>%
  as.data.frame()
# The 10 sources with the most rating rows, plotted as ordered bars.
Source_df2_10 <- slice(arrange(Source_df2, desc(Count)), seq_len(10))
ggplot(Source_df2_10, aes(x = Count, y = fct_reorder(Source, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Top 10 Sources VS Number of Users that Rated",
       x = "Number of ratings",
       y = "Source") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating per Source value over the whole cleaned dataset.
Source_df2_2 <- df_com_clean %>%
  group_by(Source) %>%
  summarise(Rating = mean(user_rating)) %>%
  as.data.frame()
# The 10 best-rated sources, ordered by mean rating and labelled with it.
Source_df2_2_10 <- slice(arrange(Source_df2_2, desc(Rating)), seq_len(10))
ggplot(Source_df2_2_10, aes(x = Rating, y = fct_reorder(Source, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Top 10 Sources by Average Rating",
       x = "Average ratings",
       y = "Source") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

Ranked 1-1000: 3 common sources. Entire dataset: 6 common sources. The most-watched sources are not among the highest rated, yet the less-watched sources are rated much higher.

This shows that the sources that are watched most often do not have high ratings.

# Rating-row count per Type value in the top-1000 subset.
Type_df <- df_com_clean_1000 %>%
  group_by(Type) %>%
  summarise(Count = n()) %>%
  as.data.frame()
# Ordered bars, one per type, labelled with the count.
ggplot(Type_df, aes(x = Count, y = fct_reorder(Type, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Type VS Number of Users that Rated",
       x = "Number of ratings",
       y = "Type") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating per Type value in the top-1000 subset.
Type_df_2 <- df_com_clean_1000 %>%
  group_by(Type) %>%
  summarise(Rating = mean(user_rating))
# Ordered bars, one per type, labelled with the mean rating.
ggplot(Type_df_2, aes(x = Rating, y = fct_reorder(Type, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Type by Average Rating",
       x = "Average ratings",
       y = "Type") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Rating-row count per Type value over the whole cleaned dataset.
Type_df2 <- df_com_clean %>%
  group_by(Type) %>%
  summarise(Count = n()) %>%
  as.data.frame()
# Ordered bars, one per type, labelled with the count.
ggplot(Type_df2, aes(x = Count, y = fct_reorder(Type, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(title = "Type VS Number of Users that Rated",
       x = "Number of ratings",
       y = "Type") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

# Mean user rating per Type value over the whole cleaned dataset.
Type_df2_2 <- df_com_clean %>%
  group_by(Type) %>%
  summarise(Rating = mean(user_rating))
# Ordered bars, one per type, labelled with the mean rating.
ggplot(Type_df2_2, aes(x = Rating, y = fct_reorder(Type, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(title = "Type by Average Rating",
       x = "Average ratings",
       y = "Type") +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )

Ranked 1-1000: the top 2 types are the same for both most rated and highest rated. Entire dataset: the top 2 are the only changes. Users watch TV much more than movies, but the ratings are almost the same.

##Visualization The distribution of the all anime

# Keep only the anime whose Genres string does not contain "Unknown".
has_known_genre <- !grepl("Unknown", csv_Anime$Genres)
df_anime_remove <- csv_Anime[has_known_genre, ]
#df_anime_remove
# Convert Score and Ranked to numeric so they can be modelled and plotted.
df_anime_remove$Score <- as.numeric(df_anime_remove$Score)
df_anime_remove$Ranked <- as.numeric(df_anime_remove$Ranked)
typeof(df_anime_remove$Ranked)
## [1] "double"
#Relationships between score and popularity
#linear regression
# Linear regression of Score on Popularity. Columns are referenced through
# `data =` rather than `df$col`, so the coefficient is named "Popularity" and
# predict(newdata = ...) works on the fitted model.
lm_score_prop <- lm(Score ~ Popularity, data = df_anime_remove)
#Visualization
# Bare column names inside aes(): ggplot2 discourages `df$col` there, and the
# axis labels become "Popularity"/"Ranked"/"Score" instead of the full
# `df_anime_remove$...` expression.
ggplot(df_anime_remove, aes(x = Popularity, y = Score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

ggplot(df_anime_remove, aes(x = Ranked, y = Score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# ggscatterstats(data = df_anime_remove, x = df_anime_remove$Popularity, y = df_anime_remove$Score)
# Distribution of anime scores in bins of width 0.1.
ggplot(df_anime_remove, aes(x = Score)) +
  geom_histogram(fill = "blue", binwidth = 0.1) +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  xlab("score") +
  ylab("number of anime")

Treemaps 1. Genre counts

# Split every anime's comma-separated Genres string into individual labels and
# tabulate them. The split is vectorised instead of growing `genre` with
# append() inside nested loops, which copied the vector on every iteration.
# NOTE(review): the gsub() strips *all* spaces and double quotes, so
# multi-word labels are squashed together (as in the original loop); trimws()
# would keep inner spaces but would change the printed labels.
genre <- gsub("[\" ]", "", unlist(strsplit(df_anime_remove[["Genres"]], ",")))
counts <- as.data.frame(table(genre))
head(counts)
##       genre Freq
## 1    Action 3888
## 2 Adventure 2957
## 3      Cars  133
## 4    Comedy 6029
## 5  Dementia  512
## 6    Demons  501
#install.packages('treemap')
# treemap() used below comes from the treemap package.
library(treemap)
head(counts)
##       genre Freq
## 1    Action 3888
## 2 Adventure 2957
## 3      Cars  133
## 4    Comedy 6029
## 5  Dementia  512
## 6    Demons  501
# Treemap of genre frequencies: tiles are sized and coloured by Freq and
# indexed by genre, then by Freq.
treemap(counts, index = c("genre", "Freq"), vSize = "Freq", vColor = "Freq", type = "value")

Preparing data: finding the type for each anime.

# Collect the Type value(s) of every anime and tabulate them. The split is
# vectorised instead of growing `type` with append() inside nested loops.
# As in the original loop, all spaces and double quotes are stripped.
type <- gsub("[\" ]", "", unlist(strsplit(df_anime_remove[["Type"]], ",")))

#finding counts for each type

type_counts <- as.data.frame(table(type))
(type_counts)
##      type Freq
## 1   Movie 2995
## 2   Music 1469
## 3     ONA 1900
## 4     OVA 3890
## 5 Special 2218
## 6      TV 4991
## 7 Unknown   36

#finding the source for each anime.

# Collect the Source value(s) of every anime and tabulate them. The split is
# vectorised instead of growing `source` with append() inside nested loops.
# As in the original loop, all spaces and double quotes are stripped, which is
# why labels print without inner spaces.
# NOTE: the variable shares its name with base::source(); calls to the
# function still resolve, but a different name would be clearer.
source <- gsub("[\" ]", "", unlist(strsplit(df_anime_remove[["Source"]], ",")))

#finding counts for each source

source_counts <- as.data.frame(table(source))
head(source_counts)
##         source Freq
## 1  4-komamanga  288
## 2         Book  112
## 3     Cardgame   64
## 4 Digitalmanga   15
## 5         Game  879
## 6   Lightnovel  768
  1. Type counts
# Treemap of type frequencies: tiles sized and coloured by Freq.
treemap(
  type_counts,
  index = c("type", "Freq"),
  vSize = "Freq",
  vColor = "Freq",
  type = "value"
)

  1. Source counts
# Treemap of source frequencies: tiles sized and coloured by Freq.
treemap(
  source_counts,
  index = c("source", "Freq"),
  vSize = "Freq",
  vColor = "Freq",
  type = "value"
)

Studios with the top 20 most anime: finding the studios for each anime.

# Split every anime's comma-separated Studios string into individual studio
# names and tabulate them. The split is vectorised instead of growing the
# vector with append() inside nested loops.
# As in the original loop, all spaces and double quotes are stripped, which is
# why names print without inner spaces.
# NOTE: this overwrites the `studios` name vector defined earlier in the file.
studios <- gsub("[\" ]", "", unlist(strsplit(df_anime_remove[["Studios"]], ",")))

#finding the counts for each studios

studios_counts <- as.data.frame(table(studios))
head(studios_counts)
##           studios Freq
## 1         10Gauge    7
## 2             1IN    1
## 3 2:10AMAnimation    6
## 4    33Collective    1
## 5          3xCube    1
## 6       81Produce    1
# Drop the "Unknown" placeholder, then plot the 20 studios with the most anime
# as horizontal bars labelled with their counts.
studios_counts_remove <- subset(studios_counts, !grepl("Unknown", studios))
studios_counts_remove %>%
  top_n(20, wt = Freq) %>%
  ggplot(aes(x = reorder(studios, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  # The argument is spelled out as `ylim`; `y =` only worked through partial
  # argument matching.
  coord_flip(ylim = c(0, 300)) +
  labs(x = "", y = "Number of animes") +
  geom_text(aes(label = Freq), hjust = -0.1, size = 3)

Genre vs. year

#install.packages("ggridges")
# geom_density_ridges() and the scale_*_cyclical / scale_point_* helpers used
# below come from ggridges.
library(ggridges)
# Ridgeline plot: one density of `year` per `genre`, from df_genre.
df_genre %>% 
  ggplot(aes(x = year, y = genre)) + 
  geom_density_ridges() + 
  scale_x_continuous(breaks = seq(1960, 2022, by = 10)) + 
  labs(x = "year", y = "genre") + 
  # NOTE(review): aes() above maps only x and y, so the two scales below have
  # no fill / point-colour aesthetic to act on - presumably no visible effect;
  # confirm.
  scale_point_color_hue(l = 40) +
  scale_fill_cyclical(values = c("blue", "green"))

#Recommendation System

Reading the rating data

##rating_data<- read.csv("/Users/abhibhagupta/Desktop/grad_coursework/sem2/DataMining/data/rating_complete.csv")
head(rating_data)
##   user_id anime_id rating
## 1       0      430      9
## 2       0     1004      5
## 3       0     3010      7
## 4       0      570      7
## 5       0     2762      9
## 6       0      431      8
dim(rating_data)
## [1] 57633278        3

Remove any missing values. -> The dataset doesn’t contain any missing values

rating_data <- rating_data[complete.cases(rating_data), ] 
dim(rating_data)
## [1] 57633278        3

Reading the Anime data which contains information about all animes

##anime_data<- read.csv("/Users/abhibhagupta/Desktop/grad_coursework/sem2/DataMining/data/anime.csv")
head(anime_data)
##   MAL_ID                            Name Score
## 1      1                    Cowboy Bebop  8.78
## 2      5 Cowboy Bebop: Tengoku no Tobira  8.39
## 3      6                          Trigun  8.24
## 4      7              Witch Hunter Robin  7.27
## 5      8                  Bouken Ou Beet  6.98
## 6     15                    Eyeshield 21  7.95
##                                                Genres           English.name
## 1     Action, Adventure, Comedy, Drama, Sci-Fi, Space           Cowboy Bebop
## 2               Action, Drama, Mystery, Sci-Fi, Space Cowboy Bebop:The Movie
## 3   Action, Sci-Fi, Adventure, Comedy, Drama, Shounen                 Trigun
## 4 Action, Mystery, Police, Supernatural, Drama, Magic     Witch Hunter Robin
## 5           Adventure, Fantasy, Shounen, Supernatural Beet the Vandel Buster
## 6                     Action, Sports, Comedy, Shounen                Unknown
##                                 Japanese.name  Type Episodes
## 1                          カウボーイビバップ    TV       26
## 2                 カウボーイビバップ 天国の扉 Movie        1
## 3                                  トライガン    TV       26
## 4 Witch Hunter ROBIN (ウイッチハンターロビン)    TV       26
## 5                                冒険王ビィト    TV       52
## 6                              アイシールド21    TV      145
##                          Aired   Premiered
## 1  Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2                  Sep 1, 2001     Unknown
## 3  Apr 1, 1998 to Sep 30, 1998 Spring 1998
## 4  Jul 2, 2002 to Dec 24, 2002 Summer 2002
## 5 Sep 30, 2004 to Sep 29, 2005   Fall 2004
## 6  Apr 6, 2005 to Mar 19, 2008 Spring 2005
##                                               Producers
## 1                                         Bandai Visual
## 2                                Sunrise, Bandai Visual
## 3                                  Victor Entertainment
## 4 TV Tokyo, Bandai Visual, Dentsu, Victor Entertainment
## 5                                      TV Tokyo, Dentsu
## 6  TV Tokyo, Nihon Ad Systems, TV Tokyo Music, Shueisha
##                              Licensors        Studios   Source        Duration
## 1     Funimation, Bandai Entertainment        Sunrise Original 24 min. per ep.
## 2          Sony Pictures Entertainment          Bones Original   1 hr. 55 min.
## 3 Funimation, Geneon Entertainment USA       Madhouse    Manga 24 min. per ep.
## 4     Funimation, Bandai Entertainment        Sunrise Original 25 min. per ep.
## 5                              Unknown Toei Animation    Manga 23 min. per ep.
## 6          VIZ Media, Sentai Filmworks         Gallop    Manga 23 min. per ep.
##                           Rating Ranked Popularity Members Favorites Watching
## 1 R - 17+ (violence & profanity)   28.0         39 1251960     61971   105808
## 2 R - 17+ (violence & profanity)  159.0        518  273145      1174     4143
## 3      PG-13 - Teens 13 or older  266.0        201  558913     12944    29113
## 4      PG-13 - Teens 13 or older 2481.0       1467   94683       587     4300
## 5                  PG - Children 3710.0       4369   13224        18      642
## 6      PG-13 - Teens 13 or older  604.0       1003  148259      2066    13907
##   Completed On.Hold Dropped Plan.to.Watch Score.10  Score.9  Score.8 Score.7
## 1    718161   71513   26678        329800 229170.0 182126.0 131625.0 62330.0
## 2    208333    1935     770         57964  30043.0  49201.0  49505.0 22632.0
## 3    343492   25465   13925        146918  50229.0  75651.0  86142.0 49432.0
## 4     46165    5121    5378         33719   2182.0   4806.0  10128.0 11618.0
## 5      7314     766    1108          3394    312.0    529.0   1242.0  1713.0
## 6     78349   14228   11573         30202   9226.0  14904.0  22811.0 16734.0
##   Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 2  5805.0  1877.0   577.0   221.0   109.0   379.0
## 3 15376.0  5838.0  1965.0   664.0   316.0   533.0
## 4  5709.0  2920.0  1083.0   353.0   164.0   131.0
## 5  1068.0   634.0   265.0    83.0    50.0    27.0
## 6  6206.0  2621.0   795.0   336.0   140.0   151.0

We are using a subset of the rating data (its first 10,000 rows), as the full rating table is too large to be processed on a personal computer.

# Keep only the first 10,000 ratings so the later steps fit in memory.
# head() never returns more rows than exist, whereas indexing with 1:10000
# would pad a shorter table with all-NA rows.
rating_data <- head(rating_data, n = 10000)
dim(rating_data)
## [1] 10000     3
table(rating_data$user_id)
## 
##   0   1   2   3   4   5   6   7   8  10  11  12  13  14  15  16  17  18  19  20 
##  35 103  51 315 118  43 311  87  18   4 161  60  25  96   9 247 709  41 679  79 
##  21  22  23  24  25  27  28  29  30  31  32  33  34  35  36  37  38  40  41  42 
## 398  63  46   6 152  97  75  58  57   1  44 134 185   4 139  86  79 118 175 674 
##  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  61  62  63 
##   4 395  52 358 285  24 201  72  28 216 493 110 175  23  76  55  86 188  68  38 
##  64  65  66  67  68  70  71  72  73 
## 281  27 100 166  23  78   1 124 471

For our anime recommendation system, we use the recommenderlab package. We first reshape the ratings table into a user-by-anime matrix and then convert it into recommenderlab's sparse rating matrix.

# Reshape the long (user_id, anime_id, rating) table to wide form: one row per
# user, one column per anime, cells holding the rating. The first column of
# the result is user_id.
# NOTE(review): data.table, reshape2 and maditr are all attached at the top of
# the file and each provides a dcast(); which one runs here depends on the
# attach order - consider qualifying the call with pkg:: once confirmed.
ratingMatrix <- dcast(rating_data, user_id ~ anime_id, value.var = "rating")
dim(ratingMatrix)
## [1]   69 3099
# Drop the user_id column so only rating columns remain, then convert the
# plain matrix into recommenderlab's sparse realRatingMatrix.
# NOTE(review): the user ids are discarded rather than kept as row names, so
# users are afterwards identified only by row position; assign the ids to
# rownames() before the conversion if per-user results must be traceable.
ratingMatrix <- as.matrix(ratingMatrix[,-1]) #remove user_ids
ratingMatrix <- as(ratingMatrix, "realRatingMatrix")
str(ratingMatrix)
## Formal class 'realRatingMatrix' [package "recommenderlab"] with 2 slots
##   ..@ data     :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. .. ..@ i       : int [1:10000] 3 6 13 18 21 22 41 50 54 58 ...
##   .. .. ..@ p       : int [1:3099] 0 11 17 22 23 24 25 28 31 34 ...
##   .. .. ..@ Dim     : int [1:2] 69 3098
##   .. .. ..@ Dimnames:List of 2
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : chr [1:3098] "1" "5" "6" "7" ...
##   .. .. ..@ x       : num [1:10000] 9 6 9 8 9 9 9 10 9 10 ...
##   .. .. ..@ factors : list()
##   ..@ normalize: NULL

Important parameters that provide us various options for building recommendation systems

recommendation_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommendation_model)
##  [1] "HYBRID_realRatingMatrix"       "ALS_realRatingMatrix"         
##  [3] "ALS_implicit_realRatingMatrix" "IBCF_realRatingMatrix"        
##  [5] "LIBMF_realRatingMatrix"        "POPULAR_realRatingMatrix"     
##  [7] "RANDOM_realRatingMatrix"       "RERECOMMEND_realRatingMatrix" 
##  [9] "SVD_realRatingMatrix"          "SVDF_realRatingMatrix"        
## [11] "UBCF_realRatingMatrix"
lapply(recommendation_model, "[[", "description")
## $HYBRID_realRatingMatrix
## [1] "Hybrid recommender that aggegates several recommendation strategies using weighted averages."
## 
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
## 
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
## 
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
## 
## $LIBMF_realRatingMatrix
## [1] "Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
## 
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
## 
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
## 
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
## 
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
## 
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html)."
## 
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."

Default parameters of User Based Collaborative Filtering (UBCF).

recommendation_model$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $weighted
## [1] TRUE
## 
## $normalize
## [1] "center"
## 
## $min_matching_items
## [1] 0
## 
## $min_predictive_items
## [1] 0

Exploring Similar Data

Utilizing preferences gathered from numerous other users, collaborative filtering involves recommending animes to consumers. For instance, if user A and user B both enjoy comedy anime, then A will be recommended the anime that B will watch in the future, and vice versa. Therefore, establishing a commonality between the two consumers is necessary for making anime recommendations. We can calculate similarities using a variety of operators, including cosine, pearson, and jaccard, with the aid of recommenderlab.

# Cosine similarity between the first four users (rows 1-4 of the rating
# matrix), printed as an ordinary matrix.
similarity_mat <- similarity(ratingMatrix[1:4, ],
                               method = "cosine",
                               which = "users")
as.matrix(similarity_mat)
##    1         2         3         4
## 1 NA 1.0000000        NA 1.0000000
## 2  1        NA 0.9961698 0.9958885
## 3 NA 0.9961698        NA 0.9969529
## 4  1 0.9958885 0.9969529        NA
image(as.matrix(similarity_mat), main = "User's Similarities")

In the above matrix, each row and column represents a user. We have taken four users and each cell in this matrix represents the similarity that is shared between the two users.

Now, we delineate the similarity that is shared between anime

# Cosine similarity between the first four animes (columns 1-4 of the rating
# matrix), printed as an ordinary matrix.
anime_similarity <- similarity(ratingMatrix[, 1:4], method =
                                 "cosine", which = "items")
as.matrix(anime_similarity)
##           1         5         6  7
## 1        NA 0.9956802 0.9993698  1
## 5 0.9956802        NA 0.9913553  1
## 6 0.9993698 0.9913553        NA NA
## 7 1.0000000 1.0000000        NA NA
image(as.matrix(anime_similarity), main = "Anime similarity")

Extract the unique rating values

# Collect the ratings that users actually gave. getRatings() returns only the
# stored (non-missing) entries of the rating matrix. Converting the sparse
# matrix with as.vector() instead turns every unrated user/anime cell into 0,
# which then appears as a spurious rating value "0" in the unique values and
# dominates any count of the ratings.
rating_values <- getRatings(ratingMatrix)
unique(rating_values)
##  [1]  0  9  6  8 10  7  5  4  3  1  2

Create a table of ratings that displays how often each rating value occurs.

Table_of_Ratings <- table(rating_values) # count how often each anime rating value occurs
Table_of_Ratings
## rating_values
##      0      1      2      3      4      5      6      7      8      9     10 
## 203762     41     40    109    201    552   1033   2240   2596   1758   1430
head(anime_data)
##   MAL_ID                            Name Score
## 1      1                    Cowboy Bebop  8.78
## 2      5 Cowboy Bebop: Tengoku no Tobira  8.39
## 3      6                          Trigun  8.24
## 4      7              Witch Hunter Robin  7.27
## 5      8                  Bouken Ou Beet  6.98
## 6     15                    Eyeshield 21  7.95
##                                                Genres           English.name
## 1     Action, Adventure, Comedy, Drama, Sci-Fi, Space           Cowboy Bebop
## 2               Action, Drama, Mystery, Sci-Fi, Space Cowboy Bebop:The Movie
## 3   Action, Sci-Fi, Adventure, Comedy, Drama, Shounen                 Trigun
## 4 Action, Mystery, Police, Supernatural, Drama, Magic     Witch Hunter Robin
## 5           Adventure, Fantasy, Shounen, Supernatural Beet the Vandel Buster
## 6                     Action, Sports, Comedy, Shounen                Unknown
##                                 Japanese.name  Type Episodes
## 1                          カウボーイビバップ    TV       26
## 2                 カウボーイビバップ 天国の扉 Movie        1
## 3                                  トライガン    TV       26
## 4 Witch Hunter ROBIN (ウイッチハンターロビン)    TV       26
## 5                                冒険王ビィト    TV       52
## 6                              アイシールド21    TV      145
##                          Aired   Premiered
## 1  Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2                  Sep 1, 2001     Unknown
## 3  Apr 1, 1998 to Sep 30, 1998 Spring 1998
## 4  Jul 2, 2002 to Dec 24, 2002 Summer 2002
## 5 Sep 30, 2004 to Sep 29, 2005   Fall 2004
## 6  Apr 6, 2005 to Mar 19, 2008 Spring 2005
##                                               Producers
## 1                                         Bandai Visual
## 2                                Sunrise, Bandai Visual
## 3                                  Victor Entertainment
## 4 TV Tokyo, Bandai Visual, Dentsu, Victor Entertainment
## 5                                      TV Tokyo, Dentsu
## 6  TV Tokyo, Nihon Ad Systems, TV Tokyo Music, Shueisha
##                              Licensors        Studios   Source        Duration
## 1     Funimation, Bandai Entertainment        Sunrise Original 24 min. per ep.
## 2          Sony Pictures Entertainment          Bones Original   1 hr. 55 min.
## 3 Funimation, Geneon Entertainment USA       Madhouse    Manga 24 min. per ep.
## 4     Funimation, Bandai Entertainment        Sunrise Original 25 min. per ep.
## 5                              Unknown Toei Animation    Manga 23 min. per ep.
## 6          VIZ Media, Sentai Filmworks         Gallop    Manga 23 min. per ep.
##                           Rating Ranked Popularity Members Favorites Watching
## 1 R - 17+ (violence & profanity)   28.0         39 1251960     61971   105808
## 2 R - 17+ (violence & profanity)  159.0        518  273145      1174     4143
## 3      PG-13 - Teens 13 or older  266.0        201  558913     12944    29113
## 4      PG-13 - Teens 13 or older 2481.0       1467   94683       587     4300
## 5                  PG - Children 3710.0       4369   13224        18      642
## 6      PG-13 - Teens 13 or older  604.0       1003  148259      2066    13907
##   Completed On.Hold Dropped Plan.to.Watch Score.10  Score.9  Score.8 Score.7
## 1    718161   71513   26678        329800 229170.0 182126.0 131625.0 62330.0
## 2    208333    1935     770         57964  30043.0  49201.0  49505.0 22632.0
## 3    343492   25465   13925        146918  50229.0  75651.0  86142.0 49432.0
## 4     46165    5121    5378         33719   2182.0   4806.0  10128.0 11618.0
## 5      7314     766    1108          3394    312.0    529.0   1242.0  1713.0
## 6     78349   14228   11573         30202   9226.0  14904.0  22811.0 16734.0
##   Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 2  5805.0  1877.0   577.0   221.0   109.0   379.0
## 3 15376.0  5838.0  1965.0   664.0   316.0   533.0
## 4  5709.0  2920.0  1083.0   353.0   164.0   131.0
## 5  1068.0   634.0   265.0    83.0    50.0    27.0
## 6  6206.0  2621.0   795.0   336.0   140.0   151.0

Since there are many columns that we do not need, we use a subset of them.

# Keep only the id, title, score and genre columns of the anime table and
# print a per-column summary of what is left.
anime_data <- subset(anime_data, select = c(MAL_ID, Name, Score, Genres))
summary(anime_data)
##      MAL_ID          Name              Score              Genres         
##  Min.   :    1   Length:17562       Length:17562       Length:17562      
##  1st Qu.: 5954   Class :character   Class :character   Class :character  
##  Median :22820   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :21477                                                           
##  3rd Qu.:35625                                                           
##  Max.   :48492
head(anime_data)
##   MAL_ID                            Name Score
## 1      1                    Cowboy Bebop  8.78
## 2      5 Cowboy Bebop: Tengoku no Tobira  8.39
## 3      6                          Trigun  8.24
## 4      7              Witch Hunter Robin  7.27
## 5      8                  Bouken Ou Beet  6.98
## 6     15                    Eyeshield 21  7.95
##                                                Genres
## 1     Action, Adventure, Comedy, Drama, Sci-Fi, Space
## 2               Action, Drama, Mystery, Sci-Fi, Space
## 3   Action, Sci-Fi, Adventure, Comedy, Drama, Shounen
## 4 Action, Mystery, Police, Supernatural, Drama, Magic
## 5           Adventure, Fantasy, Shounen, Supernatural
## 6                     Action, Sports, Comedy, Shounen

Most Viewed Animes Visualization In this section, we will explore the most viewed animes in our dataset. We will first count the number of views of each anime and then organize the counts in a table sorted in descending order.

library(ggplot2)

# Number of ratings stored for each anime (one count per matrix column); a
# rating is treated as a "view".
anime_views <- colCounts(ratingMatrix)

# One row per anime id with its view count, most viewed first.
table_views <- data.frame(anime = names(anime_views),
                          views = anime_views)
table_views <- table_views[order(table_views$views, decreasing = TRUE), ]

# Attach the titles with a single vectorised lookup instead of one subset()
# call per row. match() yields NA for an id that is missing from anime_data,
# where the row-by-row assignment stopped with a zero-length replacement error.
table_views$title <- as.character(
  anime_data$Name[match(table_views$anime, anime_data$MAL_ID)]
)
table_views[1:6, ]
##       anime views                            title
## 16498 16498    37               Shingeki no Kyojin
## 1535   1535    36                       Death Note
## 199     199    34    Sen to Chihiro no Kamikakushi
## 4224   4224    33                        Toradora!
## 11757 11757    32                 Sword Art Online
## 5114   5114    31 Fullmetal Alchemist: Brotherhood
# Bar chart of the six most-viewed animes. The bars are ordered by view count
# (highest first) rather than alphabetically by title, so the ranking can be
# read directly off the chart; the axis label is kept as "title".
ggplot(table_views[1:6, ], aes(x = reorder(title, -views), y = views)) +
  geom_col(fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "title") +
  ggtitle("Total Views of the Top Animes")

The figure above shows the number of views for the top 6 most watched anime. The anime Shingeki No Kyojin has the highest number of views.

Performing Data Preparation We will conduct data preparation in the following three steps –

Selecting useful data. Normalizing data. Binarizing the data. To select useful data, we keep only the users who have rated more than 5 animes and only the animes that have been rated by more than 5 users. This way, we filter out the least-watched animes and the least-active users.

values chosen based on what makes visualisation easier.

# Restrict the rating matrix to users with more than 5 ratings and to animes
# with more than 5 ratings, then print a summary of the reduced matrix.
active_users <- rowCounts(ratingMatrix) > 5
rated_animes <- colCounts(ratingMatrix) > 5
anime_ratings <- ratingMatrix[active_users, rated_animes]
anime_ratings
## 64 x 493 rating matrix of class 'realRatingMatrix' with 5080 ratings.
# Heatmap restricted to the most active users and most rated animes: the
# thresholds are the 98th percentiles of ratings per user (rows) and ratings
# per anime (columns). Both thresholds are reused for the normalized heatmap.
minimum_movies<- quantile(rowCounts(anime_ratings), 0.98)
minimum_users <- quantile(colCounts(anime_ratings), 0.98)
image(anime_ratings[rowCounts(anime_ratings) > minimum_movies,
                     colCounts(anime_ratings) > minimum_users],
main = "Heatmap of the top users and movies")

# Mean rating given by each user, shown as a histogram.
# Built with ggplot() + geom_histogram() because qplot() is deprecated in
# current ggplot2; the fill, outline colour, axis variable and title match the
# former qplot() call.
average_ratings <- rowMeans(anime_ratings)
ggplot(data.frame(average_ratings = average_ratings),
       aes(x = average_ratings)) +
  geom_histogram(fill = "steelblue", colour = "red") +
  ggtitle("Distribution of the average rating per user")

# Normalize the ratings with normalize()'s default settings and draw the same
# top-users / top-animes heatmap as before, reusing the 98th-percentile
# thresholds minimum_movies and minimum_users.
normalized_ratings <- normalize(anime_ratings)
image(normalized_ratings[rowCounts(normalized_ratings) > minimum_movies,
                          colCounts(normalized_ratings) > minimum_users],
main = "Normalized Ratings of the Top Users")

# Binarized view of the ratings: binarize() turns every rating of at least
# minRating (here 3) into 1 and everything else into 0. The heatmap shows
# only users and animes above the 95th percentile of rating counts.
binary_minimum_animes <- quantile(rowCounts(anime_ratings), 0.95)
binary_minimum_users <- quantile(colCounts(anime_ratings), 0.95)
#movies_watched <- binarize(movie_ratings, minRating = 1)

good_rated_animes <- binarize(anime_ratings, minRating = 3)
image(good_rated_animes[rowCounts(anime_ratings) > binary_minimum_animes,
colCounts(anime_ratings) > binary_minimum_users],
main = "Heatmap of the top users and movies")

# Random per-user train/test flag: each user lands in the training set with
# probability 0.8, otherwise in the test set.
# NOTE(review): no set.seed() call is visible before this draw, so the split
# changes from run to run; the evaluation further down builds its own split
# with evaluationScheme() - confirm whether these two objects are still needed.
n_users <- nrow(anime_ratings)
sampled_data <- sample(c(TRUE, FALSE), size = n_users, replace = TRUE,
                       prob = c(0.8, 0.2))
training_data <- anime_ratings[sampled_data, ]
testing_data <- anime_ratings[!sampled_data, ]

#First approach: Without features.

recommendation_system <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommendation_system$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $weighted
## [1] TRUE
## 
## $normalize
## [1] "center"
## 
## $min_matching_items
## [1] 0
## 
## $min_predictive_items
## [1] 0
library(caret)
library(tidyr)
# anime_ratings <- as(rating_data, "realRatingMatrix")


# Evaluation scheme on the filtered rating matrix: 90% of the users are used
# for training; for each remaining test user, `given = 1` rating is exposed to
# the recommender ("known") and the rest is held back for scoring ("unknown").
# Ratings of at least goodRating = 5 count as good.
# NOTE(review): with method = "split", k = 10 presumably builds ten independent
# splits while the getData() calls below only read the first - confirm that
# this is intended. No set.seed() is visible, so the split is not reproducible.
e <- evaluationScheme(anime_ratings, method = "split", train = 0.9, given = 1 , goodRating = 5, k = 10)

getData(e, "train")
## 57 x 493 rating matrix of class 'realRatingMatrix' with 4388 ratings.
getData(e, "unknown")
## 7 x 493 rating matrix of class 'realRatingMatrix' with 685 ratings.
getData(e, "known")
## 7 x 493 rating matrix of class 'realRatingMatrix' with 7 ratings.
# Component recommenders of the hybrid model. Each list name now matches the
# algorithm it configures: previously the POPULAR algorithm was stored under
# the label "RANDOM" and the RANDOM algorithm under "POPULAR", which made the
# fitted model's printout misleading. The order of the algorithms, and
# therefore the weight each one receives, is unchanged.
recommenders <- list(
  POPULAR = list(name = "POPULAR", param = NULL),
  RANDOM = list(name = "RANDOM", param = NULL),
  RERECOMMEND = list(name = "RERECOMMEND", param = NULL))

# for baseline uncomment this (puts all weight on the RANDOM recommender)
#weights <- c(0.0, 1.0, 0.0)

# hybrid recommender model
# Weights are positional: POPULAR = 6, RANDOM = 1, RERECOMMEND = 4.
weights <- c(6.0, 1.0, 4.0)

# Fit the weighted hybrid recommender on the training users.
r <- Recommender(data = getData(e, "train"), method = "HYBRID",
                 parameter = list(recommenders = recommenders,
                                  weights = weights))

# Predict ratings for the test users from their "known" ratings ...
p <- predict(object = r, newdata = getData(e, "known"), type = "ratings")

# ... and score the predictions against the held-out "unknown" ratings.
calcPredictionAccuracy(p, getData(e, "unknown"))
##     RMSE      MSE      MAE 
## 2.083102 4.339315 1.652481
head(calcPredictionAccuracy(p,getData(e, "unknown") , byUser = TRUE))
##       RMSE      MSE       MAE
## 1 1.312018 1.721392 1.0887865
## 2 2.041245 4.166683 1.6682996
## 3 1.120353 1.255192 0.9330583
## 4 1.090787 1.189817 0.7178225
## 5 2.308843 5.330758 1.7427775
## 6 1.032041 1.065108 0.7722257
p <- predict(r, getData(e, "known"), type = "topNList", n = 10)
p
## Recommendations as 'topNList' with n = 10 for 7 users.
model_info <- getModel(r)

class(model_info)
## [1] "list"
model_info
## $recommenders
## $recommenders$RANDOM
## Recommender of type 'POPULAR' for 'realRatingMatrix' 
## learned using 57 users.
## 
## $recommenders$POPULAR
## Recommender of type 'RANDOM' for 'realRatingMatrix' 
## learned using 57 users.
## 
## $recommenders$RERECOMMEND
## Recommender of type 'RERECOMMEND' for 'realRatingMatrix' 
## learned using 57 users.
## 
## 
## $weights
## [1] 0.54545455 0.09090909 0.36363636

Printing the recommendations for the first user.

# Item indices recommended to the first test user, and the matching anime ids.
user1 <- p@items[[1]] # recommendation for the first user
movies_user1 <- p@itemLabels[user1]

# Translate the ids into titles with one vectorised lookup. This works for any
# number of recommendations and yields NA for an id missing from anime_data;
# the fixed `1:10` loop stopped with an error when the list was shorter than
# ten entries or an id had no match.
movies_user2 <- as.character(
  anime_data$Name[match(movies_user1, anime_data$MAL_ID)]
)
movies_user2
##  [1] "Koe no Katachi"               "Ookami Kodomo no Ame to Yuki"
##  [3] "Clannad: After Story"         "Mushishi"                    
##  [5] "Steins;Gate 0"                "Hunter x Hunter (2011)"      
##  [7] "Baccano!"                     "Kimi no Na wa."              
##  [9] "Detroit Metal City"           "Koukyoushihen Eureka Seven"

Second approach: With features

# Reload the full anime table: anime_data was cut down to four columns for the
# first approach, and the columns needed below (e.g. Ranked) are among those
# that were dropped.
# NOTE(review): the same file is already read into csv_Anime at the top of the
# script, and the absolute path only exists on one machine - consider reusing
# the earlier object or a project-relative path.
csv_Anime <- read.csv(file = "/Users/abhibhagupta/Desktop/anime.csv")
anime_data <- csv_Anime

head(rating_data)
##   user_id anime_id rating
## 1       0      430      9
## 2       0     1004      5
## 3       0     3010      7
## 4       0      570      7
## 5       0     2762      9
## 6       0      431      8
head(anime_data)
##   MAL_ID                            Name Score
## 1      1                    Cowboy Bebop  8.78
## 2      5 Cowboy Bebop: Tengoku no Tobira  8.39
## 3      6                          Trigun  8.24
## 4      7              Witch Hunter Robin  7.27
## 5      8                  Bouken Ou Beet  6.98
## 6     15                    Eyeshield 21  7.95
##                                                Genres           English.name
## 1     Action, Adventure, Comedy, Drama, Sci-Fi, Space           Cowboy Bebop
## 2               Action, Drama, Mystery, Sci-Fi, Space Cowboy Bebop:The Movie
## 3   Action, Sci-Fi, Adventure, Comedy, Drama, Shounen                 Trigun
## 4 Action, Mystery, Police, Supernatural, Drama, Magic     Witch Hunter Robin
## 5           Adventure, Fantasy, Shounen, Supernatural Beet the Vandel Buster
## 6                     Action, Sports, Comedy, Shounen                Unknown
##                                 Japanese.name  Type Episodes
## 1                          カウボーイビバップ    TV       26
## 2                 カウボーイビバップ 天国の扉 Movie        1
## 3                                  トライガン    TV       26
## 4 Witch Hunter ROBIN (ウイッチハンターロビン)    TV       26
## 5                                冒険王ビィト    TV       52
## 6                              アイシールド21    TV      145
##                          Aired   Premiered
## 1  Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2                  Sep 1, 2001     Unknown
## 3  Apr 1, 1998 to Sep 30, 1998 Spring 1998
## 4  Jul 2, 2002 to Dec 24, 2002 Summer 2002
## 5 Sep 30, 2004 to Sep 29, 2005   Fall 2004
## 6  Apr 6, 2005 to Mar 19, 2008 Spring 2005
##                                               Producers
## 1                                         Bandai Visual
## 2                                Sunrise, Bandai Visual
## 3                                  Victor Entertainment
## 4 TV Tokyo, Bandai Visual, Dentsu, Victor Entertainment
## 5                                      TV Tokyo, Dentsu
## 6  TV Tokyo, Nihon Ad Systems, TV Tokyo Music, Shueisha
##                              Licensors        Studios   Source        Duration
## 1     Funimation, Bandai Entertainment        Sunrise Original 24 min. per ep.
## 2          Sony Pictures Entertainment          Bones Original   1 hr. 55 min.
## 3 Funimation, Geneon Entertainment USA       Madhouse    Manga 24 min. per ep.
## 4     Funimation, Bandai Entertainment        Sunrise Original 25 min. per ep.
## 5                              Unknown Toei Animation    Manga 23 min. per ep.
## 6          VIZ Media, Sentai Filmworks         Gallop    Manga 23 min. per ep.
##                           Rating Ranked Popularity Members Favorites Watching
## 1 R - 17+ (violence & profanity)   28.0         39 1251960     61971   105808
## 2 R - 17+ (violence & profanity)  159.0        518  273145      1174     4143
## 3      PG-13 - Teens 13 or older  266.0        201  558913     12944    29113
## 4      PG-13 - Teens 13 or older 2481.0       1467   94683       587     4300
## 5                  PG - Children 3710.0       4369   13224        18      642
## 6      PG-13 - Teens 13 or older  604.0       1003  148259      2066    13907
##   Completed On.Hold Dropped Plan.to.Watch Score.10  Score.9  Score.8 Score.7
## 1    718161   71513   26678        329800 229170.0 182126.0 131625.0 62330.0
## 2    208333    1935     770         57964  30043.0  49201.0  49505.0 22632.0
## 3    343492   25465   13925        146918  50229.0  75651.0  86142.0 49432.0
## 4     46165    5121    5378         33719   2182.0   4806.0  10128.0 11618.0
## 5      7314     766    1108          3394    312.0    529.0   1242.0  1713.0
## 6     78349   14228   11573         30202   9226.0  14904.0  22811.0 16734.0
##   Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 2  5805.0  1877.0   577.0   221.0   109.0   379.0
## 3 15376.0  5838.0  1965.0   664.0   316.0   533.0
## 4  5709.0  2920.0  1083.0   353.0   164.0   131.0
## 5  1068.0   634.0   265.0    83.0    50.0    27.0
## 6  6206.0  2621.0   795.0   336.0   140.0   151.0

#change column name so that we can merge dataframes to incorporate the important features ‘Score’ and ‘Ranked’

# Rename the rating table's anime_id column to MAL_ID so it can be merged with
# the anime table. The column is selected by name instead of by position, so
# the rename does not depend on column order and is a harmless no-op when this
# chunk is run a second time.
names(rating_data)[names(rating_data) == "anime_id"] <- "MAL_ID"
head(rating_data)
##   user_id MAL_ID rating
## 1       0    430      9
## 2       0   1004      5
## 3       0   3010      7
## 4       0    570      7
## 5       0   2762      9
## 6       0    431      8
# Join every rating with the attributes of its anime.
# all = TRUE makes this a full outer join: animes without any rating in the
# subset still get a row (with NA user_id and rating), and ratings whose anime
# is missing from anime_data get NA attributes.
# NOTE(review): those NA rows are dropped later by na.omit(), but until then
# they take part in the centring/scaling of Score and Ranked - confirm that an
# inner join was not intended.
merged_df <- merge(rating_data, anime_data, by = "MAL_ID", all = TRUE)
head(merged_df)
##   MAL_ID user_id rating         Name Score
## 1      1      57      9 Cowboy Bebop  8.78
## 2      1       6      6 Cowboy Bebop  8.78
## 3      1      22      9 Cowboy Bebop  8.78
## 4      1      62     10 Cowboy Bebop  8.78
## 5      1      19      8 Cowboy Bebop  8.78
## 6      1      14      9 Cowboy Bebop  8.78
##                                            Genres English.name
## 1 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 2 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 3 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 4 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 5 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 6 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
##        Japanese.name Type Episodes                       Aired   Premiered
## 1 カウボーイビバップ   TV       26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2 カウボーイビバップ   TV       26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 3 カウボーイビバップ   TV       26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 4 カウボーイビバップ   TV       26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 5 カウボーイビバップ   TV       26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 6 カウボーイビバップ   TV       26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
##       Producers                        Licensors Studios   Source
## 1 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 2 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 3 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 4 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 5 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 6 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
##          Duration                         Rating Ranked Popularity Members
## 1 24 min. per ep. R - 17+ (violence & profanity)   28.0         39 1251960
## 2 24 min. per ep. R - 17+ (violence & profanity)   28.0         39 1251960
## 3 24 min. per ep. R - 17+ (violence & profanity)   28.0         39 1251960
## 4 24 min. per ep. R - 17+ (violence & profanity)   28.0         39 1251960
## 5 24 min. per ep. R - 17+ (violence & profanity)   28.0         39 1251960
## 6 24 min. per ep. R - 17+ (violence & profanity)   28.0         39 1251960
##   Favorites Watching Completed On.Hold Dropped Plan.to.Watch Score.10  Score.9
## 1     61971   105808    718161   71513   26678        329800 229170.0 182126.0
## 2     61971   105808    718161   71513   26678        329800 229170.0 182126.0
## 3     61971   105808    718161   71513   26678        329800 229170.0 182126.0
## 4     61971   105808    718161   71513   26678        329800 229170.0 182126.0
## 5     61971   105808    718161   71513   26678        329800 229170.0 182126.0
## 6     61971   105808    718161   71513   26678        329800 229170.0 182126.0
##    Score.8 Score.7 Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 131625.0 62330.0 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 2 131625.0 62330.0 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 3 131625.0 62330.0 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 4 131625.0 62330.0 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 5 131625.0 62330.0 20688.0  8904.0  3184.0  1357.0   741.0  1580.0
## 6 131625.0 62330.0 20688.0  8904.0  3184.0  1357.0   741.0  1580.0

keeping the important features.

subset_df <- merged_df[, c("user_id", "MAL_ID", "rating", "Score", "Ranked")]
head(subset_df)
##   user_id MAL_ID rating Score Ranked
## 1      57      1      9  8.78   28.0
## 2       6      1      6  8.78   28.0
## 3      22      1      9  8.78   28.0
## 4      62      1     10  8.78   28.0
## 5      19      1      8  8.78   28.0
## 6      14      1      9  8.78   28.0

normalise the continuous variables.

# Build the normalised feature table from the merged subset.
df_norm <- subset_df

# Score and Ranked are converted to numbers first (non-numeric entries become
# NA), then centred and scaled to unit variance.
feature_cols <- c("Score", "Ranked")
df_norm[feature_cols] <- lapply(df_norm[, feature_cols], as.numeric)
df_norm[feature_cols] <- scale(df_norm[, feature_cols],
                               center = TRUE, scale = TRUE)

# uniform data type: store every column as a plain double vector
for (col_name in c("MAL_ID", "user_id", "Score", "Ranked", "rating")) {
  df_norm[[col_name]] <- as.double(unlist(df_norm[[col_name]]))
}

# remove Na values
df_norm <- na.omit(df_norm)

# Convert the long-format data frame into a sparse rating matrix.
# NOTE(review): the matrices printed below are users x items, so this coercion
# presumably reads only user_id, MAL_ID and rating. Confirm whether the scaled
# Score / Ranked columns are expected to reach the model; as written they only
# affect which rows na.omit() removed.
mat <- as(df_norm, "realRatingMatrix")

# Fix the RNG seed so the random train/test split (and everything that draws
# random numbers after it) gives the same result on every run.
set.seed(1234)

# Hold-out evaluation: 90% of the users go to training; for each test user one
# rating is revealed ("known") and the rest are held back ("unknown").
# Ratings of 5 or more count as good. k = 10 is passed through unchanged.
e <- evaluationScheme(
  mat,
  method = "split",
  train = 0.9,
  given = 1,
  goodRating = 5,
  k = 10
)

getData(e, "train")
## 62 x 2868 rating matrix of class 'realRatingMatrix' with 8253 ratings.
getData(e, "unknown")
## 7 x 2868 rating matrix of class 'realRatingMatrix' with 1471 ratings.
getData(e, "known")
## 7 x 2868 rating matrix of class 'realRatingMatrix' with 7 ratings.
# Component models of the hybrid recommender. Each label now matches the
# algorithm it names (the POPULAR and RANDOM labels used to be swapped); the
# positional order is unchanged, so the weights below apply to the same
# algorithms as before.
recommenders <- list(
  POPULAR = list(name = "POPULAR", param = NULL),
  RANDOM = list(name = "RANDOM", param = NULL),
  RERECOMMEND = list(name = "RERECOMMEND", param = NULL)
)

# Weights are positional: c(POPULAR, RANDOM, RERECOMMEND).

# for baseline uncomment this (puts all weight on the RANDOM recommender)
#weights <- c(0.0, 1.0, 0.0)

# hybrid recommender model
weights <- c(6.0, 1.0, 4.0)



# Fit the weighted HYBRID recommender on the training users.
r <- Recommender(
  data = getData(e, "train"),
  method = "HYBRID",
  parameter = list(recommenders = recommenders, weights = weights)
)

# Predict ratings for the test users from their revealed ("known") ratings.
p <- predict(
  object = r,
  newdata = getData(e, "known"),
  type = "ratings"
)

# Compare the predictions with the held-out ("unknown") ratings.
calcPredictionAccuracy(p, getData(e, "unknown"))
##     RMSE      MSE      MAE 
## 2.475789 6.129530 2.022432
head(calcPredictionAccuracy(p,getData(e, "unknown") , byUser = TRUE))
##        RMSE       MSE      MAE
## 4  1.499396  2.248190 1.137540
## 15 2.367594  5.605499 2.089749
## 17 2.370959  5.621448 1.841630
## 20 3.198636 10.231273 2.933034
## 28 2.574678  6.628969 2.129784
## 68 2.614120  6.833624 1.560343
p <- predict(r, getData(e, "known"), type = "topNList", n = 10)
p
## Recommendations as 'topNList' with n = 10 for 7 users.
# Item indices recommended to the first test user, and their item labels
# (the MAL_ID values, stored as character).
user1 <- p@items[[1]]
movies_user1 <- p@itemLabels[user1]

# Translate the ids into titles with one vectorised lookup. This works for any
# number of recommendations (not only exactly 10), and an id that is missing
# from anime_data yields NA instead of aborting the script.
movies_user2 <- as.character(
  anime_data$Name[match(movies_user1, as.character(anime_data$MAL_ID))]
)
movies_user2
##  [1] "Heartcatch Precure!"                               
##  [2] "Bakumatsu Rock"                                    
##  [3] "Watashi no Coffee Samurai: Jihanki-teki na Kareshi"
##  [4] "Owarimonogatari 2nd Season"                        
##  [5] "Liz to Aoi Tori"                                   
##  [6] "Kuroko no Basket NG-shuu"                          
##  [7] "Tekken: Blood Vengeance"                           
##  [8] "Demi-chan wa Kataritai: Demi-chan no Natsuyasumi"  
##  [9] "Kishin Houkou Demonbane (TV)"                      
## [10] "Magi: Sinbad no Bouken"

#Topic modelling

Now we will try to find the main topics in the synopses of the anime that were recommended to this user.

library(dplyr)

# One-column data frame of the recommended titles; the column is called "Name"
# so it can be joined against the synopsis table.
rec_anime <- data.frame(Name = movies_user2)

# Anime metadata including the synopsis text (column "sypnopsis").
synopsis_data <- read.csv(
  file = "/Users/abhibhagupta/Desktop/grad_coursework/sem2/DataMining/data/anime_with_synopsis.csv"
)
head(synopsis_data)
##   MAL_ID                            Name Score
## 1      1                    Cowboy Bebop  8.78
## 2      5 Cowboy Bebop: Tengoku no Tobira  8.39
## 3      6                          Trigun  8.24
## 4      7              Witch Hunter Robin  7.27
## 5      8                  Bouken Ou Beet  6.98
## 6     15                    Eyeshield 21  7.95
##                                                Genres
## 1     Action, Adventure, Comedy, Drama, Sci-Fi, Space
## 2               Action, Drama, Mystery, Sci-Fi, Space
## 3   Action, Sci-Fi, Adventure, Comedy, Drama, Shounen
## 4 Action, Mystery, Police, Supernatural, Drama, Magic
## 5           Adventure, Fantasy, Shounen, Supernatural
## 6                     Action, Sports, Comedy, Shounen
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 sypnopsis
## 1                 In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edward Wong—the crew embarks on thrilling adventures that unravel each member's dark and mysterious past little by little. Well-balanced with high density action and light-hearted comedy, Cowboy Bebop is a space Western classic and an homage to the smooth and improvised music it is named after.
## 2                                                                                                                                                                                                                               other day, another bounty—such is the life of the often unlucky crew of the Bebop. However, this routine is interrupted when Faye, who is chasing a fairly worthless target on Mars, witnesses an oil tanker suddenly explode, causing mass hysteria. As casualties mount due to a strange disease spreading through the smoke from the blast, a whopping three hundred million woolong price is placed on the head of the supposed perpetrator. With lives at stake and a solution to their money problems in sight, the Bebop crew springs into action. Spike, Jet, Faye, and Edward, followed closely by Ein, split up to pursue different leads across Alba City. Through their individual investigations, they discover a cover-up scheme involving a pharmaceutical company, revealing a plot that reaches much further than the ragtag team of bounty hunters could have realized.
## 3                                                                           Vash the Stampede is the man with a $$60,000,000,000 bounty on his head. The reason: he's a merciless villain who lays waste to all those that oppose him and flattens entire cities for fun, garnering him the title "The Humanoid Typhoon." He leaves a trail of death and destruction wherever he goes, and anyone can count themselves dead if they so much as make eye contact—or so the rumors say. In actuality, Vash is a huge softie who claims to have never taken a life and avoids violence at all costs. With his crazy doughnut obsession and buffoonish attitude in tow, Vash traverses the wasteland of the planet Gunsmoke, all the while followed by two insurance agents, Meryl Stryfe and Milly Thompson, who attempt to minimize his impact on the public. But soon, their misadventures evolve into life-or-death situations as a group of legendary assassins are summoned to bring about suffering to the trio. Vash's agonizing past will be unraveled and his morality and principles pushed to the breaking point.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     ches are individuals with special powers like ESP, telekinesis, mind control, etc. Robin, a 15-year-old craft user, arrives from Italy to Japan to work for an organization named STN Japan Division (STN-J) as a replacement for one of STN-J's witch hunters who was recently killed. Unlike other divisions of STN, STN-J tries to capture the witches alive in order to learn why and how they became witches in the first place. (Source: ANN)
## 5                                                                                                                                                                                                                                                                                                                                                                       It is the dark century and the people are suffering under the rule of the devil, Vandel, who is able to manipulate monsters. The Vandel Busters are a group of people who hunt these devils, and among them, the Zenon Squad is known to be the strongest busters on the continent. A young boy, Beet, dreams of joining the Zenon Squad. However, one day, as a result of Beet's fault, the Zenon squad was defeated by the devil, Beltose. The five dying busters sacrificed their life power into their five weapons, Saiga. After giving their weapons to Beet, they passed away. Years have passed since then and the young Vandel Buster, Beet, begins his adventure to carry out the Zenon Squad's will to put an end to the dark century.
## 6 Sena is like any other shy kid starting high school; he's just trying to survive. Constantly bullied, he's accustomed to running away. Surviving high school is about to become a lot more difficult after Hiruma, captain of the school's American football team, witnesses Sena's incredible agility and speed during an escape from some bullies. Hiruma schemes to make Sena the running back of his school team, The Devil Bats, hoping that it will turn around the squad's fortunes from being the laughingstock of Japan's high school leagues, to title contender. To protect his precious star player from rivaling recruiters, he enlists Sena as "team secretary," giving him a visored helmet and the nickname "Eyeshield 21" to hide his identity. The Devilbats will look to make their way to the Christmas Bowl, an annual tournament attended by the best football teams in Japan, with "Eyeshield 21" leading the way. Will they be able to win the Christmas Bowl? Will Sena be able to transform from a timid, undersized freshman to an all-star player? Put on your pads and helmet to find out!
# Attach the synopsis metadata to every recommended title. all.x = TRUE keeps
# recommendations that have no entry in synopsis_data (their extra columns
# are NA). merge() sorts the result by the key column, so the rows come out
# alphabetically by Name rather than in recommendation order.
left_join_df <- merge(rec_anime, synopsis_data, by = "Name", all.x = TRUE)
head(left_join_df)
##                                               Name MAL_ID Score
## 1                                   Bakumatsu Rock  23037  6.05
## 2 Demi-chan wa Kataritai: Demi-chan no Natsuyasumi  35823  7.53
## 3                              Heartcatch Precure!   7645  7.79
## 4                     Kishin Houkou Demonbane (TV)   1067  6.58
## 5                         Kuroko no Basket NG-shuu  15487  7.69
## 6                                  Liz to Aoi Tori  35677  8.22
##                                                          Genres
## 1                     Action, Music, Comedy, Historical, Shoujo
## 2                      Comedy, Vampire, Fantasy, School, Seinen
## 3 Action, Slice of Life, Comedy, Magic, Fantasy, School, Shoujo
## 4                   Action, Harem, Magic, Romance, Ecchi, Mecha
## 5                               Comedy, School, Shounen, Sports
## 6                                          Drama, Music, School
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    sypnopsis
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                   ouma Sakamoto wants everyone to know about his passion for rock 'n' roll, so he roams around town with his electric guitar willing to show anyone he encounters that he's just as skilled as the famous Shinsengumi stars they admire. Unfortunately, Japan doesn't allow anything other than that group's Heaven's Songs, for writing or performing different types of music is forbidden and can lead to harsh consequences. Agitated by these strict rules and brainwashing, Ryouma does everything he can to show people that the music he loves will bring them the freedom they deserve. Along with his bandmates Shinsaku Takasugi and Kogoru Katsura, Ryouma works hard to find places for his rock 'n' roll group to perform. Refusing to back down until their music is accepted in Japan, the trio begin to realize that there's more to their passion than they had thought.
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          During summer break, Tetsuo asks Sakie to join him on patrol duty for the local summer festival, which both the demis and Himari and her friends go to. As each group decides to check out a nearby shrine, they get spooked by some strange occurrences, which turn out to be each other. As the next term begins, Tetsuo converses with an invisible woman named Matsuri. (Source: Crunchyroll)
## 3                                                                                                                                                                                                                                                                                                                                                                     Young flower enthusiast Tsubomi Hanasaki is often modest and quiet. But with her family moving to a new town, she aims to reinvent her image at her new school as someone more confident and outgoing. On moving day, she dreams of a mysterious tree in the sky guarded by a warrior named "Cure Moonlight." Tsubomi quickly learns that this was no ordinary dream when she encounters two mysterious fairies—Chypre and Coffret—who are being hunted down by a strange woman. When the woman summons a giant monster to attack the city, Tsubomi finds herself transforming into a warrior to fight the enemy! Taking on the alias "Cure Blossom," Tsubomi learns that the woman is part of a villainous group that aims to turn the world into a lifeless desert, with her new duty being to stop it from happening. As Tsubomi continues to battle more monsters and uncover the secrets behind Cure Moonlight, will she find the confidence needed to overcome her timid nature?
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Kurou Daijuuji is a poor detective living in Arkham City. One day, he was requested by Ruri Hado of Hado Financial Group, to search for a magic book. While he initially refused, Ruri offered him a large sum of money upon completion of her request, in which bribed Kuro to accept. As Kurou searches for the book, he unexpectedly runs into Al, a pretty girl that is actually a powerful grimoire. They forge a contract with each other, bestowing Kuro with powerful magic. Soon afterwards, Al also activates Demonbane, a deus machina owned by the Hado Financial Group, to combat the mechanical menace from the Black Lodge. With this, the war between the Hado Financial Group and the Black Lodge begins.... (Source: ANN)
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          mated bloopers, based on the extra section of the original manga series, included with the BD/DVD series for Kuroko no Basket (both limited and normal editions).
## 6 z's days of solitude come to an end when she meets a blue bird in the form of a young girl. Although their relationship blossoms, Liz must make a heart-wrenching decision in order to truly realize her love for Blue Bird. High school seniors and close friends Mizore Yoroizuka and Nozomi Kasaki are tasked to play the lead instruments in the third movement of Liz and the Blue Bird, a concert band piece inspired by this fairy tale. The introverted and reserved Mizore plays the oboe, representing the kind and gentle Liz. Meanwhile, the radiant and popular Nozomi plays the flute, portraying the cheerful and energetic Blue Bird. However, as they rehearse, the distance between Mizore and Nozomi seems to grow. Their disjointed duet disappoints the band, and with graduation on the horizon, uncertainty about the future spurs complicated emotions. With little time to improve as their performance draws near, they desperately attempt to connect with their respective characters. But when Mizore and Nozomi consider the story from a brand-new perspective, will the girls find the strength to face harsh realities? A spin-off film adaptation of the Hibike Euphonium! series, Liz to Aoi Tori dances between the parallels of a charming fairy tale, a moving musical piece, and a delicate high school friendship.
#install.packages("tidytext")
library(tidyverse) # general utility & workflow functions
library(tidytext) # tidy implimentation of NLP methods
library(topicmodels) # for LDA topic modelling 
library(tm) # general text mining functions, making document term matrixes
library(SnowballC) #
library(stringi)
#' Fit an LDA topic model and report the most informative terms per topic
#'
#' @param input_text Character vector with one document per element (for
#'   example a column of a data frame).
#' @param plot Logical. If `TRUE` (the default) a faceted bar chart of the top
#'   terms is returned; otherwise the table of top terms is returned.
#' @param number_of_topics Number of topics to fit (4 by default).
#' @return A ggplot object when `plot` is `TRUE`, otherwise a tibble with the
#'   columns `topic`, `term` and `beta`, sorted by topic and descending beta.
top_terms_by_topic_LDA <- function(input_text,
                                   plot = TRUE,
                                   number_of_topics = 4) {
  # Build the corpus and the document-term matrix (word counts per document).
  # The local name differs from Corpus() so the constructor is not shadowed.
  text_corpus <- Corpus(VectorSource(input_text))
  DTM <- DocumentTermMatrix(text_corpus)

  # Keep only documents that contain at least one term; LDA() raises an error
  # when the matrix has empty rows.
  non_empty_docs <- unique(DTM$i)
  DTM <- DTM[non_empty_docs, ]

  # Fit the model (fixed seed for reproducibility) and extract the per-topic
  # term probabilities in tidy form.
  lda <- LDA(DTM, k = number_of_topics, control = list(seed = 1234))
  topics <- tidy(lda, matrix = "beta")

  # Ten highest-beta terms of each topic (ties are kept).
  top_terms <- topics %>%
    group_by(topic) %>%
    top_n(10, beta) %>%
    ungroup() %>%
    arrange(topic, -beta)

  if (!isTRUE(plot)) {
    return(top_terms)
  }

  # reorder_within() sorts the terms inside each facet. A plain reorder()
  # assigns every term a single global position, so a term shared by several
  # topics would leave the bars of a facet unsorted. scale_x_reordered()
  # removes the suffix that reorder_within() appends to the labels.
  top_terms %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    scale_x_reordered() +
    labs(x = NULL, y = "Beta") +
    coord_flip() # horizontal bars
}
# Build a document-term matrix from the synopses of the recommended anime.
reviewsCorpus <- Corpus(VectorSource(left_join_df$sypnopsis))
reviewsDTM <- DocumentTermMatrix(reviewsCorpus)

# Convert the document-term matrix to a tidy table
# (columns used below: document, term, count).
reviewsDTM_tidy <- tidy(reviewsDTM)

# Extra, domain-specific stop words removed on top of the English list.
# NOTE(review): "hotel" and "room" look carried over from a hotel-review
# example; replace them with terms that are uninformative for anime synopses.
custom_stop_words <- tibble(word = c("hotel", "room"))

# Remove the English and the custom stop words.
reviewsDTM_tidy_cleaned <- reviewsDTM_tidy %>%
  anti_join(stop_words, by = c("term" = "word")) %>%
  anti_join(custom_stop_words, by = c("term" = "word"))

# Rebuild one cleaned text per document, repeating each term as often as it
# occurred. The grouping is dropped again so later steps receive a plain,
# ungrouped table.
cleaned_documents <- reviewsDTM_tidy_cleaned %>%
  group_by(document) %>%
  mutate(terms = toString(rep(term, count))) %>%
  ungroup() %>%
  select(document, terms) %>%
  distinct()
library(topicmodels)

top_terms_by_topic_LDA(cleaned_documents$terms, number_of_topics = 10)

#install.packages("wordcloud")
library(wordcloud)
#install.packages("RColorBrewer")
library(RColorBrewer)
#install.packages("wordcloud2")
library(wordcloud2)
docs <- Corpus(VectorSource(left_join_df$sypnopsis))
docs
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 10
# Clean the synopses before counting words. The order matters: the text is
# lower-cased and stop words are removed while apostrophes are still present,
# so contractions in the stop-word list (e.g. "he's", "doesn't") are matched.
# Punctuation and digits are stripped afterwards, and whitespace is collapsed
# last to tidy up the gaps left by the earlier removals.
docs <- docs %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

# Term frequencies over all documents, most frequent first.
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)
library(tm)
library(tmap)
#install.packages("tmaptools")
#install.packages("leafem")
library(wordcloud)

Word cloud of the most frequent words in the synopses of the anime recommended to user 1

# Fixed seed so the word cloud is reproducible.
set.seed(1234)

# Draw words that occur at least 10 times (at most 100 words), most frequent
# first, with 35% of the words rotated.
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 10,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  scale = c(2, 0.01)
)